Source Code of org.terrier.structures.BlockDirectIndex

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is BlockDirectIndex.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Douglas Johnson <johnsoda{a.}dcs.gla.ac.uk> (original author)
 *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk> 
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
 */
package org.terrier.structures;
import gnu.trove.TIntArrayList;


import java.io.IOException;


import org.apache.log4j.Logger;
import org.terrier.compression.BitIn;
import org.terrier.structures.postings.BlockIterablePosting;
import org.terrier.structures.postings.IterablePosting;
/**
 * Extended direct index that saves both block 
 * and field information about the terms that 
 * appear in a document.
 * @author Douglas Johnson, Vassilis Plachouras
 */
public class BlockDirectIndex extends DirectIndex {
    protected int DocumentBlockCountDelta = 1;
    
  /** The logger used */
  private static Logger logger = Logger.getLogger(BlockDirectIndex.class);
  /**
   * Constructs an instance of the class with 
   * the given index, using the specified structure name.
   * @param index The index to be used
   * @param structureName the name of this direct index
   * @throws IOException 
   */
  public BlockDirectIndex(Index index, String structureName) throws IOException {
    super(index, structureName, BlockIterablePosting.class);
  }
  /**
   * Constructs an instance of the class with
   * @param index
   * @param structureName
   * @param postingClass
   * @throws IOException
   */
  public BlockDirectIndex(Index index, String structureName,
      Class<? extends IterablePosting> postingClass) throws IOException 
  {
    super(index, structureName, postingClass);
  }
  
  /**
   * Returns a five dimensional array containing the 
   * term ids and the term frequencies for the given document. 
   * @return int[][] a five dimensional array containing 
   *         the term ids, frequencies, field scores, 
   *         block frequencies and the containing the block ids.
   * @param docid the id of the document whose terms we are looking for.
   */
  public int[][] getTerms(int docid) throws IOException
  {
    DocumentIndexEntry de = docIndex.getDocumentEntry(docid);
    if (de == null)
      return null;
    if (de.getNumberOfEntries() == 0)
      return null;
    return getTerms(de);
  }
  
  /** 
   * {@inheritDoc} 
   */    
  public int[][] getTerms(BitIndexPointer pointer) throws IOException {
    final long startOffset = pointer.getOffset();
    final byte startBitOffset = pointer.getOffsetBits();
    final int df = pointer.getNumberOfEntries();
    
    final boolean loadTagInformation = fieldCount > 0;
    
    final int[][] documentTerms = new int[4+fieldCount][];
    for(int i=0;i<fieldCount+3;i++)
      documentTerms[i] = new int[df];
    final TIntArrayList blockids = new TIntArrayList(df); //ideally we'd have TF here


    try{
      final BitIn file = this.file[pointer.getFileNumber()].readReset(startOffset, startBitOffset);
  
      if (loadTagInformation) { //if there are tag information to process
        //documentTerms[2] = new int[df]; 
        documentTerms[0][0] = file.readGamma() - 1;        
        documentTerms[1][0] = file.readUnary();
        for(int fi=0;fi < fieldCount;fi++)
          documentTerms[2+fi][0] = file.readUnary() -1;
        int blockfreq = documentTerms[2+fieldCount][0] = file.readUnary() - DocumentBlockCountDelta;
        int tmpBlocks[] = new int[blockfreq];
        int previousBlockId = -1;
        for(int j=0;j<blockfreq;j++)
        {
          tmpBlocks[j] = previousBlockId = file.readGamma() + previousBlockId;
        }
        blockids.add(tmpBlocks);
        
        for (int i = 1; i < df; i++) {          
          documentTerms[0][i]  = file.readGamma() + documentTerms[0][i - 1];
          documentTerms[1][i]  = file.readUnary();
          for(int fi=0;fi < fieldCount;fi++)
            documentTerms[2+fi][0] = file.readUnary() -1;
          blockfreq = documentTerms[2+fieldCount][i] = file.readUnary() - DocumentBlockCountDelta;
          tmpBlocks = new int[blockfreq];
          previousBlockId = -1;
          for(int j=0;j<blockfreq;j++)
          {
            tmpBlocks[j] = previousBlockId = file.readGamma() + previousBlockId;
          }
          blockids.add(tmpBlocks);
        }
      } else { //no tag information to process          
        
        documentTerms[0][0] = file.readGamma() - 1;
        documentTerms[1][0] = file.readUnary();
        
        int blockfreq = documentTerms[2][0] = file.readUnary() - DocumentBlockCountDelta;
        int tmpBlocks[] = new int[blockfreq];
        int previousBlockId = -1;
        for(int j=0;j<blockfreq;j++)
        {
          tmpBlocks[j] = previousBlockId = file.readGamma() + previousBlockId;
        }
        blockids.add(tmpBlocks);
        
        for (int i = 1; i < df; i++) {          
          documentTerms[0][i]  = file.readGamma() + documentTerms[0][i - 1];
          documentTerms[1][i]  = file.readUnary();


          blockfreq = documentTerms[2][i] = file.readUnary() - DocumentBlockCountDelta;
          tmpBlocks = new int[blockfreq];
          previousBlockId = -1;
          for(int j=0;j<blockfreq;j++)
          {
            tmpBlocks[j] = previousBlockId = file.readGamma() + previousBlockId;
          }
          blockids.add(tmpBlocks);
        }
      }
      documentTerms[documentTerms.length-1] = blockids.toNativeArray();
      return documentTerms;
    } catch (IOException ioe) {
      logger.error("Problem reading block inverted index", ioe);
      return null;
    }


  }
}
Source Code of org.terrier.structures.BlockDirectIndex

Related Classes of org.terrier.structures.BlockDirectIndex